Loading the data
## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (%
## of GDP), CO2 emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the first rows of the loaded data.
data_frame |>
  head()
## # A tibble: 6 × 20
## ...1 `Country Name` Year Agriculture, value added (…¹ CO2 emissions (metri…²
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 0 Afghanistan 1962 NA 0.0738
## 2 1 Afghanistan 1967 NA 0.124
## 3 2 Afghanistan 1972 NA 0.131
## 4 3 Afghanistan 1977 NA 0.183
## 5 4 Afghanistan 1982 NA 0.166
## 6 5 Afghanistan 1987 NA 0.276
## # ℹ abbreviated names: ¹​`Agriculture, value added (% of GDP)`,
## # ²​`CO2 emissions (metric tons per capita)`
## # ℹ 15 more variables:
## # `Domestic credit provided by financial sector (% of GDP)` <dbl>,
## # `Electric power consumption (kWh per capita)` <dbl>,
## # `Energy use (kg of oil equivalent per capita)` <dbl>,
## # `Exports of goods and services (% of GDP)` <dbl>, …
Data Visualization
# Restrict the data to the 1962 observations.
filtered_data <- filter(data_frame, Year == 1962)

# Scatter plot of CO2 emissions against GDP per capita, drawn only from the
# rows where both values are available.
filtered_data |>
  select(co2 = starts_with('CO2'), gdpPercap) |>
  filter(if_all(everything(), \(column) !is.na(column))) |>
  ggplot(aes(x = gdpPercap, y = co2)) +
  geom_point() +
  labs(
    title = 'CO2 emissions per capita generally increases with GDP',
    x = 'GDP per capita',
    y = 'CO2 emissions (metric tons per capita)'
  )

Correlation between CO2 emissions and GDP per capita
# Paired CO2 / GDP-per-capita observations taken from filtered_data, keeping
# only the rows where both values are present.
correlation_columns <- filtered_data |>
  select(co2 = starts_with('CO2'), gdpPercap) |>
  filter(if_all(everything(), \(column) !is.na(column)))

# Both variables are quantitative and continuous, so Pearson correlation is
# used (it is also the default method of cor()).
# The result, 0.9260817, indicates a positive relationship between the two
# variables: they evolve in the same direction.
with(
  correlation_columns,
  cor(co2, gdpPercap, method = 'pearson')
)
## [1] 0.9260817
# The p-value is the probability of obtaining a correlation coefficient at
# least as extreme as r when sampling from a population in which the null
# hypothesis (true correlation = 0) holds.
# The columns are passed as `$` expressions on purpose: cor.test() echoes the
# argument expressions in the "data:" line of its printed report.
cor.test(
  x = correlation_columns$co2,
  y = correlation_columns$gdpPercap,
  method = 'pearson'
)
##
## Pearson's product-moment correlation
##
## data: correlation_columns$co2 and correlation_columns$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8934697 0.9489792
## sample estimates:
## cor
## 0.9260817
# Correlation between CO2 emissions and GDP per capita computed separately for
# each year (cor() default method), shown as one bar per year with the rounded
# coefficient printed above it.
data_frame |>
  select(Year, co2 = starts_with('CO2'), gdpPercap) |>
  # Drop every row in which either of the two measurements is missing.
  filter(!(is.na(co2) | is.na(gdpPercap))) |>
  group_by(Year) |>
  summarise(r = cor(co2, gdpPercap), .groups = 'drop') |>
  ggplot(aes(x = Year, y = r)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(label = round(r, 2)), vjust = -0.5) +
  # One axis tick per year that occurs anywhere in the full data set.
  scale_x_continuous(breaks = sort(unique(data_frame$Year))) +
  theme(legend.position = 'none')

library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive bubble chart for 1967: CO2 emissions against GDP per capita,
# with point size mapped to population and colour mapped to continent.
# NOTE(review): the name `plot` shadows base::plot as a variable; it is kept
# here because later code may refer to it.
plot <- data_frame |>
  filter(Year == 1967) |>
  select(
    co2 = starts_with('CO2'),
    gdpPercap,
    pop,
    continent
  ) |>
  ggplot() +
  geom_point(aes(x = gdpPercap, y = co2, size = pop, color = continent)) +
  scale_x_continuous(
    # label_dollar() lives in the scales package; calling it through the
    # namespace means the chunk does not depend on library(scales) having
    # been attached. Values are shown in thousands of dollars.
    labels = scales::label_dollar(scale = 1/1000, suffix = 'K')
  ) +
  # Zoom the view without discarding the points outside the limits.
  coord_cartesian(
    xlim = c(0, 20000),
    ylim = c(0, 25)
  ) +
  labs(
    x = 'GDP per capita',
    y = 'CO2 emissions (metric tons per capita)'
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(plot)